-
Notifications
You must be signed in to change notification settings - Fork 13.6k
[SLPVectorizer] Support SLPVectorizer cases of tan across all backends #95517
New issue
Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.
By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.
Already on GitHub? Sign in to your account
Conversation
@llvm/pr-subscribers-backend-webassembly @llvm/pr-subscribers-llvm-transforms Author: Farzon Lotfi (farzonl) ChangesAdd a default f16 type promotion Full diff: https://github.com/llvm/llvm-project/pull/95517.diff 10 Files Affected:
diff --git a/llvm/include/llvm/Analysis/TargetLibraryInfo.h b/llvm/include/llvm/Analysis/TargetLibraryInfo.h
index f5da222d11f55..ce0cc38baf77e 100644
--- a/llvm/include/llvm/Analysis/TargetLibraryInfo.h
+++ b/llvm/include/llvm/Analysis/TargetLibraryInfo.h
@@ -415,10 +415,12 @@ class TargetLibraryInfo {
return false;
switch (F) {
default: break;
+ // clang-format off
case LibFunc_copysign: case LibFunc_copysignf: case LibFunc_copysignl:
case LibFunc_fabs: case LibFunc_fabsf: case LibFunc_fabsl:
case LibFunc_sin: case LibFunc_sinf: case LibFunc_sinl:
case LibFunc_cos: case LibFunc_cosf: case LibFunc_cosl:
+ case LibFunc_tan: case LibFunc_tanf: case LibFunc_tanl:
case LibFunc_sqrt: case LibFunc_sqrtf: case LibFunc_sqrtl:
case LibFunc_sqrt_finite: case LibFunc_sqrtf_finite:
case LibFunc_sqrtl_finite:
@@ -437,6 +439,7 @@ class TargetLibraryInfo {
case LibFunc_memcmp: case LibFunc_bcmp: case LibFunc_strcmp:
case LibFunc_strcpy: case LibFunc_stpcpy: case LibFunc_strlen:
case LibFunc_strnlen: case LibFunc_memchr: case LibFunc_mempcpy:
+ // clang-format on
return true;
}
return false;
diff --git a/llvm/include/llvm/Analysis/TargetTransformInfoImpl.h b/llvm/include/llvm/Analysis/TargetTransformInfoImpl.h
index 7828bdc1f1f43..b1d426830f0da 100644
--- a/llvm/include/llvm/Analysis/TargetTransformInfoImpl.h
+++ b/llvm/include/llvm/Analysis/TargetTransformInfoImpl.h
@@ -156,14 +156,17 @@ class TargetTransformInfoImplBase {
StringRef Name = F->getName();
// These will all likely lower to a single selection DAG node.
+ // clang-format off
if (Name == "copysign" || Name == "copysignf" || Name == "copysignl" ||
- Name == "fabs" || Name == "fabsf" || Name == "fabsl" || Name == "sin" ||
+ Name == "fabs" || Name == "fabsf" || Name == "fabsl" ||
Name == "fmin" || Name == "fminf" || Name == "fminl" ||
Name == "fmax" || Name == "fmaxf" || Name == "fmaxl" ||
- Name == "sinf" || Name == "sinl" || Name == "cos" || Name == "cosf" ||
- Name == "cosl" || Name == "sqrt" || Name == "sqrtf" || Name == "sqrtl")
+ Name == "sin" || Name == "sinf" || Name == "sinl" ||
+ Name == "cos" || Name == "cosf" || Name == "cosl" ||
+ Name == "tan" || Name == "tanf" || Name == "tanl" ||
+ Name == "sqrt" || Name == "sqrtf" || Name == "sqrtl")
return false;
-
+ // clang-format on
// These are all likely to be optimized into something smaller.
if (Name == "pow" || Name == "powf" || Name == "powl" || Name == "exp2" ||
Name == "exp2l" || Name == "exp2f" || Name == "floor" ||
diff --git a/llvm/lib/Analysis/ValueTracking.cpp b/llvm/lib/Analysis/ValueTracking.cpp
index 8126d2a1acc27..0a477082a7d12 100644
--- a/llvm/lib/Analysis/ValueTracking.cpp
+++ b/llvm/lib/Analysis/ValueTracking.cpp
@@ -3993,6 +3993,10 @@ Intrinsic::ID llvm::getIntrinsicForCallSite(const CallBase &CB,
case LibFunc_cosf:
case LibFunc_cosl:
return Intrinsic::cos;
+ case LibFunc_tan:
+ case LibFunc_tanf:
+ case LibFunc_tanl:
+ return Intrinsic::tan;
case LibFunc_exp:
case LibFunc_expf:
case LibFunc_expl:
diff --git a/llvm/lib/CodeGen/TargetLoweringBase.cpp b/llvm/lib/CodeGen/TargetLoweringBase.cpp
index 8240a1fd7e2ff..de534994fa48c 100644
--- a/llvm/lib/CodeGen/TargetLoweringBase.cpp
+++ b/llvm/lib/CodeGen/TargetLoweringBase.cpp
@@ -961,7 +961,7 @@ void TargetLoweringBase::initActions() {
setOperationAction(
{ISD::FCOPYSIGN, ISD::SIGN_EXTEND_INREG, ISD::ANY_EXTEND_VECTOR_INREG,
ISD::SIGN_EXTEND_VECTOR_INREG, ISD::ZERO_EXTEND_VECTOR_INREG,
- ISD::SPLAT_VECTOR, ISD::LRINT, ISD::LLRINT},
+ ISD::SPLAT_VECTOR, ISD::LRINT, ISD::LLRINT, ISD::FTAN},
VT, Expand);
// Constrained floating-point operations default to expand.
@@ -1020,6 +1020,7 @@ void TargetLoweringBase::initActions() {
ISD::FTAN},
{MVT::f32, MVT::f64, MVT::f128}, Expand);
+ setOperationAction(ISD::FTAN, MVT::f16, Promote);
// Default ISD::TRAP to expand (which turns it into abort).
setOperationAction(ISD::TRAP, MVT::Other, Expand);
diff --git a/llvm/test/CodeGen/RISCV/half-intrinsics.ll b/llvm/test/CodeGen/RISCV/half-intrinsics.ll
index c493a9b2cb1df..bfc26b0d65980 100644
--- a/llvm/test/CodeGen/RISCV/half-intrinsics.ll
+++ b/llvm/test/CodeGen/RISCV/half-intrinsics.ll
@@ -2862,3 +2862,123 @@ define i1 @isnan_d_fpclass(half %x) {
%1 = call i1 @llvm.is.fpclass.f16(half %x, i32 3) ; nan
ret i1 %1
}
+
+declare half @llvm.tan.f16(half)
+
+define half @tan_f16(half %a) nounwind {
+; RV32IZFH-LABEL: tan_f16:
+; RV32IZFH: # %bb.0:
+; RV32IZFH-NEXT: addi sp, sp, -16
+; RV32IZFH-NEXT: sw ra, 12(sp) # 4-byte Folded Spill
+; RV32IZFH-NEXT: fcvt.s.h fa0, fa0
+; RV32IZFH-NEXT: call tanf
+; RV32IZFH-NEXT: fcvt.h.s fa0, fa0
+; RV32IZFH-NEXT: lw ra, 12(sp) # 4-byte Folded Reload
+; RV32IZFH-NEXT: addi sp, sp, 16
+; RV32IZFH-NEXT: ret
+;
+; RV64IZFH-LABEL: tan_f16:
+; RV64IZFH: # %bb.0:
+; RV64IZFH-NEXT: addi sp, sp, -16
+; RV64IZFH-NEXT: sd ra, 8(sp) # 8-byte Folded Spill
+; RV64IZFH-NEXT: fcvt.s.h fa0, fa0
+; RV64IZFH-NEXT: call tanf
+; RV64IZFH-NEXT: fcvt.h.s fa0, fa0
+; RV64IZFH-NEXT: ld ra, 8(sp) # 8-byte Folded Reload
+; RV64IZFH-NEXT: addi sp, sp, 16
+; RV64IZFH-NEXT: ret
+;
+; RV32IZHINX-LABEL: tan_f16:
+; RV32IZHINX: # %bb.0:
+; RV32IZHINX-NEXT: addi sp, sp, -16
+; RV32IZHINX-NEXT: sw ra, 12(sp) # 4-byte Folded Spill
+; RV32IZHINX-NEXT: fcvt.s.h a0, a0
+; RV32IZHINX-NEXT: call tanf
+; RV32IZHINX-NEXT: fcvt.h.s a0, a0
+; RV32IZHINX-NEXT: lw ra, 12(sp) # 4-byte Folded Reload
+; RV32IZHINX-NEXT: addi sp, sp, 16
+; RV32IZHINX-NEXT: ret
+;
+; RV64IZHINX-LABEL: tan_f16:
+; RV64IZHINX: # %bb.0:
+; RV64IZHINX-NEXT: addi sp, sp, -16
+; RV64IZHINX-NEXT: sd ra, 8(sp) # 8-byte Folded Spill
+; RV64IZHINX-NEXT: fcvt.s.h a0, a0
+; RV64IZHINX-NEXT: call tanf
+; RV64IZHINX-NEXT: fcvt.h.s a0, a0
+; RV64IZHINX-NEXT: ld ra, 8(sp) # 8-byte Folded Reload
+; RV64IZHINX-NEXT: addi sp, sp, 16
+; RV64IZHINX-NEXT: ret
+;
+; RV32I-LABEL: tan_f16:
+; RV32I: # %bb.0:
+; RV32I-NEXT: addi sp, sp, -16
+; RV32I-NEXT: sw ra, 12(sp) # 4-byte Folded Spill
+; RV32I-NEXT: slli a0, a0, 16
+; RV32I-NEXT: srli a0, a0, 16
+; RV32I-NEXT: call __extendhfsf2
+; RV32I-NEXT: call tanf
+; RV32I-NEXT: call __truncsfhf2
+; RV32I-NEXT: lw ra, 12(sp) # 4-byte Folded Reload
+; RV32I-NEXT: addi sp, sp, 16
+; RV32I-NEXT: ret
+;
+; RV64I-LABEL: tan_f16:
+; RV64I: # %bb.0:
+; RV64I-NEXT: addi sp, sp, -16
+; RV64I-NEXT: sd ra, 8(sp) # 8-byte Folded Spill
+; RV64I-NEXT: slli a0, a0, 48
+; RV64I-NEXT: srli a0, a0, 48
+; RV64I-NEXT: call __extendhfsf2
+; RV64I-NEXT: call tanf
+; RV64I-NEXT: call __truncsfhf2
+; RV64I-NEXT: ld ra, 8(sp) # 8-byte Folded Reload
+; RV64I-NEXT: addi sp, sp, 16
+; RV64I-NEXT: ret
+;
+; RV32IZFHMIN-LABEL: tan_f16:
+; RV32IZFHMIN: # %bb.0:
+; RV32IZFHMIN-NEXT: addi sp, sp, -16
+; RV32IZFHMIN-NEXT: sw ra, 12(sp) # 4-byte Folded Spill
+; RV32IZFHMIN-NEXT: fcvt.s.h fa0, fa0
+; RV32IZFHMIN-NEXT: call tanf
+; RV32IZFHMIN-NEXT: fcvt.h.s fa0, fa0
+; RV32IZFHMIN-NEXT: lw ra, 12(sp) # 4-byte Folded Reload
+; RV32IZFHMIN-NEXT: addi sp, sp, 16
+; RV32IZFHMIN-NEXT: ret
+;
+; RV64IZFHMIN-LABEL: tan_f16:
+; RV64IZFHMIN: # %bb.0:
+; RV64IZFHMIN-NEXT: addi sp, sp, -16
+; RV64IZFHMIN-NEXT: sd ra, 8(sp) # 8-byte Folded Spill
+; RV64IZFHMIN-NEXT: fcvt.s.h fa0, fa0
+; RV64IZFHMIN-NEXT: call tanf
+; RV64IZFHMIN-NEXT: fcvt.h.s fa0, fa0
+; RV64IZFHMIN-NEXT: ld ra, 8(sp) # 8-byte Folded Reload
+; RV64IZFHMIN-NEXT: addi sp, sp, 16
+; RV64IZFHMIN-NEXT: ret
+;
+; RV32IZHINXMIN-LABEL: tan_f16:
+; RV32IZHINXMIN: # %bb.0:
+; RV32IZHINXMIN-NEXT: addi sp, sp, -16
+; RV32IZHINXMIN-NEXT: sw ra, 12(sp) # 4-byte Folded Spill
+; RV32IZHINXMIN-NEXT: fcvt.s.h a0, a0
+; RV32IZHINXMIN-NEXT: call tanf
+; RV32IZHINXMIN-NEXT: fcvt.h.s a0, a0
+; RV32IZHINXMIN-NEXT: lw ra, 12(sp) # 4-byte Folded Reload
+; RV32IZHINXMIN-NEXT: addi sp, sp, 16
+; RV32IZHINXMIN-NEXT: ret
+;
+; RV64IZHINXMIN-LABEL: tan_f16:
+; RV64IZHINXMIN: # %bb.0:
+; RV64IZHINXMIN-NEXT: addi sp, sp, -16
+; RV64IZHINXMIN-NEXT: sd ra, 8(sp) # 8-byte Folded Spill
+; RV64IZHINXMIN-NEXT: fcvt.s.h a0, a0
+; RV64IZHINXMIN-NEXT: call tanf
+; RV64IZHINXMIN-NEXT: fcvt.h.s a0, a0
+; RV64IZHINXMIN-NEXT: ld ra, 8(sp) # 8-byte Folded Reload
+; RV64IZHINXMIN-NEXT: addi sp, sp, 16
+; RV64IZHINXMIN-NEXT: ret
+ %1 = call half @llvm.tan.f16(half %a)
+ ret half %1
+}
diff --git a/llvm/test/CodeGen/WebAssembly/simd-unsupported.ll b/llvm/test/CodeGen/WebAssembly/simd-unsupported.ll
index d214a3af5a151..1d6e073271efa 100644
--- a/llvm/test/CodeGen/WebAssembly/simd-unsupported.ll
+++ b/llvm/test/CodeGen/WebAssembly/simd-unsupported.ll
@@ -377,6 +377,14 @@ define <4 x float> @cos_v4f32(<4 x float> %x) {
ret <4 x float> %v
}
+; CHECK-LABEL: tan_v4f32:
+; CHECK: call $push[[L:[0-9]+]]=, tanf
+declare <4 x float> @llvm.tan.v4f32(<4 x float>)
+define <4 x float> @tan_v4f32(<4 x float> %x) {
+ %v = call <4 x float> @llvm.tan.v4f32(<4 x float> %x)
+ ret <4 x float> %v
+}
+
; CHECK-LABEL: powi_v4f32:
; CHECK: call $push[[L:[0-9]+]]=, __powisf2
declare <4 x float> @llvm.powi.v4f32.i32(<4 x float>, i32)
@@ -469,6 +477,14 @@ define <2 x double> @cos_v2f64(<2 x double> %x) {
ret <2 x double> %v
}
+; CHECK-LABEL: tan_v2f64:
+; CHECK: call $push[[L:[0-9]+]]=, tan
+declare <2 x double> @llvm.tan.v2f64(<2 x double>)
+define <2 x double> @tan_v2f64(<2 x double> %x) {
+ %v = call <2 x double> @llvm.tan.v2f64(<2 x double> %x)
+ ret <2 x double> %v
+}
+
; CHECK-LABEL: powi_v2f64:
; CHECK: call $push[[L:[0-9]+]]=, __powidf2
declare <2 x double> @llvm.powi.v2f64.i32(<2 x double>, i32)
diff --git a/llvm/test/Transforms/LoopVectorize/intrinsic.ll b/llvm/test/Transforms/LoopVectorize/intrinsic.ll
index 0f070347dd4ef..9c910d70807a1 100644
--- a/llvm/test/Transforms/LoopVectorize/intrinsic.ll
+++ b/llvm/test/Transforms/LoopVectorize/intrinsic.ll
@@ -162,6 +162,60 @@ for.end: ; preds = %for.body, %entry
declare double @llvm.cos.f64(double)
+define void @tan_f32(i32 %n, ptr %y, ptr %x) {
+; CHECK-LABEL: @tan_f32(
+; CHECK: llvm.tan.v4f32
+; CHECK: ret void
+;
+entry:
+ %cmp6 = icmp sgt i32 %n, 0
+ br i1 %cmp6, label %for.body, label %for.end
+
+for.body: ; preds = %entry, %for.body
+ %indvars.iv = phi i64 [ %indvars.iv.next, %for.body ], [ 0, %entry ]
+ %arrayidx = getelementptr inbounds float, ptr %y, i64 %indvars.iv
+ %0 = load float, ptr %arrayidx, align 4
+ %call = tail call float @llvm.tan.f32(float %0)
+ %arrayidx2 = getelementptr inbounds float, ptr %x, i64 %indvars.iv
+ store float %call, ptr %arrayidx2, align 4
+ %indvars.iv.next = add i64 %indvars.iv, 1
+ %lftr.wideiv = trunc i64 %indvars.iv.next to i32
+ %exitcond = icmp eq i32 %lftr.wideiv, %n
+ br i1 %exitcond, label %for.end, label %for.body
+
+for.end: ; preds = %for.body, %entry
+ ret void
+}
+
+declare float @llvm.tan.f32(float)
+
+define void @tan_f64(i32 %n, ptr %y, ptr %x) {
+; CHECK-LABEL: @tan_f64(
+; CHECK: llvm.tan.v4f64
+; CHECK: ret void
+;
+entry:
+ %cmp6 = icmp sgt i32 %n, 0
+ br i1 %cmp6, label %for.body, label %for.end
+
+for.body: ; preds = %entry, %for.body
+ %indvars.iv = phi i64 [ %indvars.iv.next, %for.body ], [ 0, %entry ]
+ %arrayidx = getelementptr inbounds double, ptr %y, i64 %indvars.iv
+ %0 = load double, ptr %arrayidx, align 8
+ %call = tail call double @llvm.tan.f64(double %0)
+ %arrayidx2 = getelementptr inbounds double, ptr %x, i64 %indvars.iv
+ store double %call, ptr %arrayidx2, align 8
+ %indvars.iv.next = add i64 %indvars.iv, 1
+ %lftr.wideiv = trunc i64 %indvars.iv.next to i32
+ %exitcond = icmp eq i32 %lftr.wideiv, %n
+ br i1 %exitcond, label %for.end, label %for.body
+
+for.end: ; preds = %for.body, %entry
+ ret void
+}
+
+declare double @llvm.tan.f64(double)
+
define void @exp_f32(i32 %n, ptr %y, ptr %x) {
; CHECK-LABEL: @exp_f32(
; CHECK: llvm.exp.v4f32
diff --git a/llvm/test/Transforms/SLPVectorizer/AArch64/accelerate-vector-functions-inseltpoison.ll b/llvm/test/Transforms/SLPVectorizer/AArch64/accelerate-vector-functions-inseltpoison.ll
index 6db27e597a63f..eae38295ba08c 100644
--- a/llvm/test/Transforms/SLPVectorizer/AArch64/accelerate-vector-functions-inseltpoison.ll
+++ b/llvm/test/Transforms/SLPVectorizer/AArch64/accelerate-vector-functions-inseltpoison.ll
@@ -548,13 +548,11 @@ define <4 x float> @tan_4x(ptr %a) {
; NOACCELERATE-NEXT: [[VECEXT_1:%.*]] = extractelement <4 x float> [[TMP0]], i32 1
; NOACCELERATE-NEXT: [[TMP2:%.*]] = tail call fast float @tanf(float [[VECEXT_1]])
; NOACCELERATE-NEXT: [[VECINS_1:%.*]] = insertelement <4 x float> [[VECINS]], float [[TMP2]], i32 1
-; NOACCELERATE-NEXT: [[VECEXT_2:%.*]] = extractelement <4 x float> [[TMP0]], i32 2
-; NOACCELERATE-NEXT: [[TMP3:%.*]] = tail call fast float @tanf(float [[VECEXT_2]])
-; NOACCELERATE-NEXT: [[VECINS_2:%.*]] = insertelement <4 x float> [[VECINS_1]], float [[TMP3]], i32 2
-; NOACCELERATE-NEXT: [[VECEXT_3:%.*]] = extractelement <4 x float> [[TMP0]], i32 3
-; NOACCELERATE-NEXT: [[TMP4:%.*]] = tail call fast float @tanf(float [[VECEXT_3]])
-; NOACCELERATE-NEXT: [[VECINS_3:%.*]] = insertelement <4 x float> [[VECINS_2]], float [[TMP4]], i32 3
-; NOACCELERATE-NEXT: ret <4 x float> [[VECINS_3]]
+; NOACCELERATE-NEXT: [[TMP3:%.*]] = shufflevector <4 x float> [[TMP0]], <4 x float> poison, <2 x i32> <i32 2, i32 3>
+; NOACCELERATE-NEXT: [[TMP4:%.*]] = call fast <2 x float> @llvm.tan.v2f32(<2 x float> [[TMP3]])
+; NOACCELERATE-NEXT: [[TMP5:%.*]] = shufflevector <2 x float> [[TMP4]], <2 x float> poison, <4 x i32> <i32 0, i32 1, i32 poison, i32 poison>
+; NOACCELERATE-NEXT: [[VECINS_31:%.*]] = shufflevector <4 x float> [[VECINS_1]], <4 x float> [[TMP5]], <4 x i32> <i32 0, i32 1, i32 4, i32 5>
+; NOACCELERATE-NEXT: ret <4 x float> [[VECINS_31]]
;
entry:
%0 = load <4 x float>, ptr %a, align 16
diff --git a/llvm/test/Transforms/SLPVectorizer/AArch64/accelerate-vector-functions.ll b/llvm/test/Transforms/SLPVectorizer/AArch64/accelerate-vector-functions.ll
index 24e16deacb3af..5e2dd305f0557 100644
--- a/llvm/test/Transforms/SLPVectorizer/AArch64/accelerate-vector-functions.ll
+++ b/llvm/test/Transforms/SLPVectorizer/AArch64/accelerate-vector-functions.ll
@@ -548,13 +548,11 @@ define <4 x float> @tan_4x(ptr %a) {
; NOACCELERATE-NEXT: [[VECEXT_1:%.*]] = extractelement <4 x float> [[TMP0]], i32 1
; NOACCELERATE-NEXT: [[TMP2:%.*]] = tail call fast float @tanf(float [[VECEXT_1]])
; NOACCELERATE-NEXT: [[VECINS_1:%.*]] = insertelement <4 x float> [[VECINS]], float [[TMP2]], i32 1
-; NOACCELERATE-NEXT: [[VECEXT_2:%.*]] = extractelement <4 x float> [[TMP0]], i32 2
-; NOACCELERATE-NEXT: [[TMP3:%.*]] = tail call fast float @tanf(float [[VECEXT_2]])
-; NOACCELERATE-NEXT: [[VECINS_2:%.*]] = insertelement <4 x float> [[VECINS_1]], float [[TMP3]], i32 2
-; NOACCELERATE-NEXT: [[VECEXT_3:%.*]] = extractelement <4 x float> [[TMP0]], i32 3
-; NOACCELERATE-NEXT: [[TMP4:%.*]] = tail call fast float @tanf(float [[VECEXT_3]])
-; NOACCELERATE-NEXT: [[VECINS_3:%.*]] = insertelement <4 x float> [[VECINS_2]], float [[TMP4]], i32 3
-; NOACCELERATE-NEXT: ret <4 x float> [[VECINS_3]]
+; NOACCELERATE-NEXT: [[TMP3:%.*]] = shufflevector <4 x float> [[TMP0]], <4 x float> poison, <2 x i32> <i32 2, i32 3>
+; NOACCELERATE-NEXT: [[TMP4:%.*]] = call fast <2 x float> @llvm.tan.v2f32(<2 x float> [[TMP3]])
+; NOACCELERATE-NEXT: [[TMP5:%.*]] = shufflevector <2 x float> [[TMP4]], <2 x float> poison, <4 x i32> <i32 0, i32 1, i32 poison, i32 poison>
+; NOACCELERATE-NEXT: [[VECINS_31:%.*]] = shufflevector <4 x float> [[VECINS_1]], <4 x float> [[TMP5]], <4 x i32> <i32 0, i32 1, i32 4, i32 5>
+; NOACCELERATE-NEXT: ret <4 x float> [[VECINS_31]]
;
entry:
%0 = load <4 x float>, ptr %a, align 16
diff --git a/llvm/test/Transforms/SLPVectorizer/X86/call.ll b/llvm/test/Transforms/SLPVectorizer/X86/call.ll
index 4181148a4d829..8835e3b144be6 100644
--- a/llvm/test/Transforms/SLPVectorizer/X86/call.ll
+++ b/llvm/test/Transforms/SLPVectorizer/X86/call.ll
@@ -6,6 +6,7 @@ target triple = "x86_64-apple-macosx10.8.0"
declare double @sin(double) nounwind willreturn
declare double @cos(double) nounwind willreturn
+declare double @tan(double) nounwind willreturn
declare double @pow(double, double) nounwind willreturn
declare double @exp2(double) nounwind willreturn
declare double @sqrt(double) nounwind willreturn
@@ -48,6 +49,24 @@ define void @cos_libm(ptr %a, ptr %b) {
ret void
}
+define void @tan_libm(ptr %a, ptr %b) {
+; CHECK-LABEL: @tan_libm(
+; CHECK-NEXT: [[TMP2:%.*]] = load <2 x double>, ptr [[A:%.*]], align 8
+; CHECK-NEXT: [[TMP3:%.*]] = call <2 x double> @llvm.tan.v2f64(<2 x double> [[TMP2]])
+; CHECK-NEXT: store <2 x double> [[TMP3]], ptr [[B:%.*]], align 8
+; CHECK-NEXT: ret void
+;
+ %a0 = load double, ptr %a, align 8
+ %idx1 = getelementptr inbounds double, ptr %a, i64 1
+ %a1 = load double, ptr %idx1, align 8
+ %tan1 = tail call double @tan(double %a0) nounwind readnone
+ %tan2 = tail call double @tan(double %a1) nounwind readnone
+ store double %tan1, ptr %b, align 8
+ %idx2 = getelementptr inbounds double, ptr %b, i64 1
+ store double %tan2, ptr %idx2, align 8
+ ret void
+}
+
define void @pow_libm(ptr %a, ptr %b) {
; CHECK-LABEL: @pow_libm(
; CHECK-NEXT: [[TMP2:%.*]] = load <2 x double>, ptr [[A:%.*]], align 8
|
✅ With the latest revision this PR passed the C/C++ code formatter. |
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
Split split the backend changes into a separate PR.
Add a default f16 type promotion
4e45379
to
62d5393
Compare
@nikic are there any other changes you want? |
@alexey-bataev if you have time could you take a look at this PR? |
On a PTO, will return next week |
Can we have some more tests for other targets too? Would be good to have a test for a target that does not support vectorization of tan |
The tan llvm inttrinsic was defined in
This change is just linking the tan libfuncs to the intrinsic so the libfuncs can also be vectorized. As written there is no backend specific behavior. It is going to vectorize tan across all backends. In other words there doesn't appear to be any way to mark a libfunc or an intrinsic as |
Need to be sure that the cost of the vector version is high enough for the targets that do not support it. Otherwise, they may suffer from the perf drop. And still need to add the test, especially codegen (if still not) to be sure that the targets can lower the vector versions correctly. |
I'm not familar with a way to check for perf impact across all backends. Could you give some guidance on how I could figure that out to answer this question? I'm not familar enough with most these backends to know which ones needs tests. Trig functions seem to only be tested on As for which backends support vectorization, I'm going to assume we can limit backends to what exists in I think a partial list can be figured out from here: llvm-project/llvm/include/llvm/Analysis/TargetLibraryInfo.h Lines 124 to 134 in 62d5393
So that leads me to believe there is vectoization support on There also might be some RISCV support based on what I found for
So if we subtract what we know supports vectorization from the full list of tests we are left with
So one thing of note i discovered is that sin\cos are SLPvectorized despite the fact that some of these backends do not support it. That makes me wonder if sin\cos vectorization should be removed or if it is ok that tan can be vectorized even if it lacks support across all backends. |
I had the patch for this issue some time ago (https://reviews.llvm.org/D154738, see @RKSimon response), we still need supporting this, such nodes should not be vectorized. Ok, let's keep it as is for now |
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
LG
@nikic you are listed as code owner, are you happy with these changes aswell? |
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
LGTM
LLVM Buildbot has detected a new failure on builder Full details are available at: https://lab.llvm.org/buildbot/#/builders/72/builds/520 Here is the relevant piece of the build log for the reference:
|
llvm#95517) This PR is intended to address the limited SLPVectorizer support of tan raised in the comments of this PR: llvm#94559. Right now emitting the tan intrinsisic allows you to vectorize tan, but emitting the libfunc does not. to address this the libcall needs to be mapped to the intrinsic. and the libcall and function name need to be marked approriately so they can be optimized or defined as a call lowering.
This PR is intended to address the limited SLPVectorizer support of tan raised in the comments of this PR: #94559.
Right now emitting the tan intrinsisic allows you to vectorize tan, but emitting the libfunc does not. to address this the libcall needs to be mapped to the intrinsic. and the libcall and function name need to be marked approriately so they can be optimized or defined as a call lowering.